####Dan Hopkins
####Blogs Project
####R Code October 22 2006

###condor_submit_util -i optimalbush050807.R -f -N -n 1
###set figure
fg <- "Bush"

###sample size
MM <- 2150
n.samp <- 1

### number of symptoms 
sym <- 15

#########automated
library(VA)

###load data
dta <-  read.csv("/nfs/fs1/projects/poliblog/global.txt",sep=",",na=c("NA","NC","<NA>",""),header=T)

###subset figure of interest
dta2 <- dta[dta$figure==fg,]

###CREATE PERMUTED CODINGS

dta2$jointcodeKW <- dta2$keneshiawashington.coding
dta2$jointcodeKW[dta2$keneshiawashington.coding %in% c(NA)] <- dta2$keneshiawashington.relev[dta2$keneshiawashington.coding %in% c(NA)]+2

dta2$jointcodeKC <- dta2$kcolton.coding
dta2$jointcodeKC[dta2$kcolton.coding %in% c(NA)] <- dta2$kcolton.relev[dta2$kcolton.coding %in% c(NA)]+2

dta2$jointcodeMK <- dta2$mknowles.coding
dta2$jointcodeMK[dta2$mknowles.coding %in% c(NA)] <- dta2$mknowles.relev[dta2$mknowles.coding %in% c(NA)]+2

dta2$jointcodeNH <- dta2$nickhayes.coding
dta2$jointcodeNH[dta2$nickhayes.coding %in% c(NA)] <- dta2$nickhayes.relev[dta2$nickhayes.coding %in% c(NA)]+2

dta2$jointcodeAP <- dta2$andrewprokop.coding
dta2$jointcodeAP[dta2$andrewprokop.coding %in% c(NA)] <- dta2$andrewprokop.relev[dta2$andrewprokop.coding %in% c(NA)]+2


###matrix of codings

codemat <- cbind(dta2$jointcodeAP,dta2$jointcodeNH,dta2$jointcodeMK,dta2$jointcodeKC,dta2$jointcodeKW)

###function to translate codings
conv.func <- function(mat){
  n<-dim(mat)[1]
  vec <- c()
  for(i in 1:n){
    cd <- mat[i,]
    if(sum(! cd %in% c(NA))==1)
      res <- cd[which(! cd %in% c(NA))]
    if(sum(! cd %in% c(NA))==0)   
      res <- NA
    
    if(any(c(-2,-1,0,1,2) %in% cd))
      res <- mean(cd[which(cd %in% c(-2,-1,0,1,2))])
    
    if( min(cd,na.rm=T)>2 & ! (sum(cd %in% c(NA))==5    ))
      res <- min(cd,na.rm=T)
    vec <- c(vec,res)
  }

  return(vec)
}

dta2$code1<-conv.func(codemat)

###RANDOMLY BREAK TIES
dta2$code2 <- NA

aa <- c(-.01,.01)
ss <- sample(aa,length(dta2$code1[dta2$code1 %in% c(-1.5,-.5,.5,1.5)]),replace=T)
dta2$code2 <- NA
dta2$code1[dta2$code1 %in% c(-1.5,-.5,.5,1.5)] <- dta2$code1[dta2$code1 %in% c(-1.5,-.5,.5,1.5)]+ss

dta2$code2[dta2$code1 <= -1.5 & ! dta2$code1 %in% c(NA)] <- -2
dta2$code2[dta2$code1 > -1.5 & dta2$code1 <= -.5 & ! dta2$code1 %in% c(NA)] <- -1
dta2$code2[dta2$code1 > -.5 & dta2$code1 <= .5 & ! dta2$code1 %in% c(NA)] <- 0
dta2$code2[dta2$code1 > .5 & dta2$code1 <= 1.5 & ! dta2$code1 %in% c(NA)] <- 1
dta2$code2[dta2$code1 > 1.5 & dta2$code1 < 2.5 & ! dta2$code1 %in% c(NA)] <- 2
dta2$code2[dta2$code1 > 2 & ! dta2$code1 %in% c(NA)] <- dta2$code1[dta2$code1 > 2 & ! dta2$code1 %in% c(NA)]

####create new codes
set.seed(54321)
rs <- sample(1:dim(dta2)[1],MM,replace=F)

resmat <- tmat <- actmat <-matrix(NA,n.samp,7)
ccc <- 1
nsy <- sym
dta3 <- dta2[,order(colnames(dta2))]
first4 <- substr(colnames(dta3),start=1,stop=4)
st<-min(which(first4=="WORD"))
fn <-max(which(first4=="WORD")) 

nwords <-fn - st + 1

dta.BUSH <- as.data.frame(1*(dta3[,st:fn]>0))
dta.BUSH$jointcode <- dta3$code2
dta.BUSHf <- na.omit(dta.BUSH)

### identify first, last word
sttxt <- colnames(dta.BUSHf)[1]
fntxt <- colnames(dta.BUSHf)[nwords]

### calculate weights
nn<-apply(dta.BUSHf[,1:nwords],2,sum)/dim(dta.BUSHf)[1]
nn2 <- nn
nn2[nn>.99 | nn < .01] <- 0
wts <- nn2/sum(nn2)

### randomly sample training set
train <- dta.BUSHf[rs,]
  
####remove half of -2 in test set
test <- dta.BUSHf
nnn <- dim(test)[1]
vec11 <- which(test$jointcode==4)
rs2 <- sample(1:length(vec11),round(length(vec11)/2,digits=0),replace=F)
vec22 <- vec11[rs2]
testA <- test[-vec22,]

### remove invariant columns
v1<-apply(train,2,sd)
remov1<-which(v1==0)

v2<-apply(testA,2,sd)
remov2<-which(v2==0)

train2 <- train[,-c(remov1,remov2)]
test2 <- testA[,-c(remov1,remov2)]
wts2 <- wts[-c(remov1,remov2)]

### estimate VA 
txt <- paste("vout1<-va(cbind(",sttxt,"+...+",fntxt,")~jointcode,data=list(train,testA),nsymp=nsy,n.subset=300,prob.wt=wts,boot.se=T,printit=F)",sep="")
eval(parse(text=txt))
resmat[ccc,] <- vout1$est.CSMF
actmat[ccc,1] <- round(mean(1*(test2$jointcode==-2)),digits=6)
actmat[ccc,2] <- round(mean(1*(test2$jointcode==-1)),digits=6)
actmat[ccc,3] <- round(mean(1*(test2$jointcode==0)),digits=6)
actmat[ccc,4] <- round(mean(1*(test2$jointcode==1)),digits=6)
actmat[ccc,5] <- round(mean(1*(test2$jointcode==2)),digits=6)
actmat[ccc,6] <- round(mean(1*(test2$jointcode==3)),digits=6)
actmat[ccc,7] <- round(mean(1*(test2$jointcode==4)),digits=6) 

save(actmat,resmat,sym,vout1,file="/nfs/fs1/projects/poliblog/replication/blogs/optim030308BUSH15.Rdata")


